#-------------------------------------------------------#
#   Quality Assessment of the Academic Freedom Index    #
#-------------------------------------------------------#

# Title: "Quality Assessment of the Academic Freedom Index: Strengths, Weaknesses, and How Best to Use It" 
# Authors: "Lott, Lars", "Spannagel, Janika"
# date: 2024-09-19
# journal: Perspectives on Politics
# DOI: TBA
# written under "R and RStudio (4.4.1)"

# original code written by Dan Pemstein and collgeues (McMann et al. 2022 Political Analysis)


#### Preliminaries ####

R.version$version.string

# clear workspace
rm(list=ls())

# load packages
library(here)
library(tidyverse)
library(ggpubr)
library(countrycode)
library(ggrepel)
library(ggthemes)
library(lubridate)
library(dotwhisker)
library(estimatr)
library(texreg)
library(readxl)
library(haven)

set.seed(1234)


#load coder-level data 
coder_level_ds <- read.csv(file.path("data/vdem_v13",  "vdem_13_coder_level", "coder_level_ds_v13.csv"))
gc()

#### Raw correlated errors (Appendix Section D, Tables D1, D2, and D3)

vnames <- c("v2cafres", "v2cafexch", "v2cainsaut", 
            "v2casurv", "v2clacfree")

dat <- coder_level_ds %>%
  dplyr::select(country_text_id, country_id, historical_date, coder_id, vnames)


### some info, to make sure we don't lose stuff along the way 

## Total pairwise coders (unique coders/var on the diagonal)
ucoders <- sapply(vnames, function (x)
  sapply(vnames, function (y)
    length(unique(dat$coder_id[!is.na(dat[,x]) & !is.na(dat[,y])]))))

## Make dataset long wrt codings, calculate averages
library(magrittr)
library(dplyr)
library(tidyr)
dat <- dat %>% pivot_longer(vnames, names_to="variable", values_to="code") %>%
  group_by(country_text_id, historical_date, variable) %>% 
  mutate(code.avg = mean(code, na.rm=T))

## Calculate "errors" or differences from means
dat <- dat %>% mutate(code.diff = code - code.avg)

## Make sure we have the right counts of diffs
ucoders.now <- sapply(vnames, function (x)
  length(unique(dat[dat$variable==x & !is.na(dat$code.diff),]$coder_id)))
diag(ucoders) == ucoders.now # should be all TRUE

## Calculate coder-level avg differences within variables, so within-coder/var avg error
dat <- dat %>% ungroup() %>% group_by(variable, coder_id) %>%
  mutate(code.diff.avg.w = mean(code.diff, na.rm=T))
dat$code.diff.avg.w[is.nan(dat$code.diff.avg.w)] <- NA

## Make sure we have the right counts of diffs
ucoders.now <- sapply(vnames, function (x)
  length(unique(dat[dat$variable==x & !is.na(dat$code.diff.avg.w),]$coder_id)))
diag(ucoders) == ucoders.now # should be all TRUE

## Create a coder-variable dataset of coder/var avg error
dat.cv <- dat %>% group_by(coder_id, variable) %>% summarise(avg.error = mean(code.diff.avg.w))

## widen this dataset so observation is coder, columns are variable avg errors
dat.cv <- dat.cv %>% pivot_wider(names_from="variable", values_from="avg.error")

###  Calculate within-coder correlations between avg errors across variables

## Appendix Section D, Table D1
table_D1 <- cor(dat.cv[,2:6],use="pairwise.complete")

tinytable::tt(as.data.frame(table_D1),  digits = 3) %>% save_tt("outputs_original_data/Table_D1.tex", overwrite = T)

summary(lm(v2cafres ~ v2cafexch, data=dat.cv))
summary(lm(v2cafres ~ v2cainsaut, data=dat.cv))
summary(lm(v2cafres ~ v2casurv, data=dat.cv))
summary(lm(v2cafres ~ v2clacfree, data=dat.cv))

## Counts of unique coders
ucoders.now <- sapply(vnames, function (x)
  sapply(vnames, function (y)
    length(unique(dat.cv$coder_id[!is.na(dat.cv[,x]) & !is.na(dat.cv[,y])]))))

## Counts of cross-coding
vars.coded <- apply(dat.cv, 1, function (x) sum(!is.na(x))-1)
sapply(1:5, function(x) sum(vars.coded==x))
sum(sapply(1:5, function(x) sum(vars.coded==x)))

## Appendix Section D, Table D3
tinytable::tt(as.data.frame(ucoders),  digits = 3) %>% save_tt("outputs_original_data/Table_D3.tex", overwrite = T)


#### Model-corrected correlated errors

### Load posteriors

v2cafexch <- readRDS("data/posteriors/v13/v2cafexch_posterior.rds") # freedom of academic exchange and dissemination
v2cafres <- readRDS("data/posteriors/v13/v2cafres_posterior.rds") # freedom to research and teach
v2cainsaut <- readRDS("data/posteriors/v13/v2cainsaut_posterior.rds") # institutional autonomy
v2casurv <- readRDS("data/posteriors/v13/v2casurv_posterior.rds") #campus integrity
v2clacfree <- readRDS("data/posteriors/v13/v2clacfree_posterior.rds") # freedom of academic and cultural expression

gc()
### Perceptions have the conditional distribution N(Z_i, \sigma_r^2), truncated to
### \gamma_{j,{y_{ir}-1, y_{ir}}}.
### \sigma_r^2 = (1/beta_r)^2 
### \gamma_{r,k} = \tau_{r,k} / \sigma_r
### i is the country-year, r is the rater
### truncates min(beta) at 0.07 to get rid of really annoying edge cases
library(rstan)
library(truncnorm)
library(abind)

gen.errors <- function (name, thin=4, minbeta=0.07) {
  post <- rstan::extract(get(name)$posterior)
  
  coder.table <- get(name)$post.sample$gamma %>%
    rownames %>% 
    grep("1[.]", ., value = TRUE) %>%
    gsub("1[.]", "", .) %>%
    data.frame(
      coder_id = as.numeric(.), 
      rater_id = seq_along(.), 
      row.names = NULL,
      stringsAsFactors = FALSE)
  
  y <- get(name)$model_input$y
  r <- get(name)$model_input$j_id
  i <- get(name)$model_input$sy_id
  post$beta[post$beta < minbeta] <- minbeta
  sigma <- 1/post$beta
  tau <- array(apply(post$gamma, 3, function (x) x * sigma), dim=dim(post$gamma))
  tau <- abind(matrix(-Inf, nrow=nrow(sigma), ncol=ncol(sigma)),
               tau,
               matrix(Inf, nrow=nrow(sigma), ncol=ncol(sigma)), along=3)
  cid <-  sapply(r, function (x) coder.table$coder_id[coder.table$rater_id == x])
  pcept <- sapply(seq(1, nrow(post$Z), thin), function (sim) {
    tau.obs <- sapply(1:length(y), function (obs) tau[sim, r[obs], y[obs]])
    taup1.obs <- sapply(1:length(y), function(obs) tau[sim, r[obs], y[obs]+1])
    rtruncnorm(length(y), tau.obs, taup1.obs,
               post$Z[sim,i], (sigma[sim, r])^2)
  })
  err <- t(post$Z[seq(1, nrow(post$Z), thin), i]) - pcept
  rownames(err) <-  cid
  return(err)
}

v2cafres.errors <- gen.errors("v2cafres")
v2cafexch.errors <- gen.errors("v2cafexch")
v2cainsaut.errors <- gen.errors("v2cainsaut")
v2casurv.errors <- gen.errors("v2casurv")
v2clacfree.errors <- gen.errors("v2clacfree")

errors <- sapply(paste(vnames, "errors", sep="."), function (var)
  apply(get(var), 1, mean))

raters <- unique(unlist(sapply(1:5, function (i) names(errors[[i]]))))

errors.avg <- sapply(1:5, function (i)
  sapply(raters, function (r)
    mean(errors[[i]][names(errors[[i]])==r])))

## Appendix Section D, Table D2
table_d2 <- round(cor(errors.avg, use="pairwise.complete.obs"), 3)

tt(as.data.frame(table_d2)) %>% save_tt("outputs_original_data/Table_D2.tex", overwrite = T)

